Working Directory: /Users/jamesdiao/Documents/Kohane_Lab/2016-paper-ACMG-penetrance/ACMG_Penetrance

\newpage

1 Download, Transform, and Load Data

1.1 Collect ACMG Gene Panel

http://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/

ACMG.panel <- ACMG.table[,"Gene_Name"] %>% unique
cat(sprintf("Processed Table from ACMG Website %s x %s (selected rows):", nrow(ACMG.table), ncol(ACMG.table)))
## Processed Table from ACMG Website 64 x 4 (selected rows):
row.names(ACMG.table) <- paste0("N",1:nrow(ACMG.table))
ACMG.table[-c(3:4,6:9,15,19,21:27,29,33:36,38:39,41:43,45,47,49:50,54:56,62),] %>% pander
  Disease_Name Disease_MIM Gene_Name Gene_MIM
N1 Adenomatous polyposis coli 175100 APC 611731
N2 Aortic aneurysm, familial thoracic 4 132900 MYH11 160745
N5 Arrhythmogenic right ventricular cardiomyopathy, type 5 604400 TMEM43 612048
N10 Breast-ovarian cancer, familial 1 604370 BRCA1 113705
N11 Breast-ovarian cancer, familial 2 612555 BRCA2 600185
N12 Brugada syndrome 1 601144 SCN5A 600163
N13 Catecholaminergic polymorphic ventricular tachycardia 604772 RYR2 180902
N14 Dilated cardiomyopathy 1A 115200 LMNA 150330
N16 Ehlers-Danlos syndrome, type 4 130050 COL3A1 120180
N17 Fabry’s disease 301500 GLA 300644
N18 Familial hypercholesterolemia 143890 APOB 107730
N20 Familial hypertrophic cardiomyopathy 1 192600 MYH7 160760
N28 Familial medullary thyroid carcinoma 155240 RET 164761
N30 Left ventricular noncompaction 6 601494 TNNT2 191045
N31 Li-Fraumeni syndrome 1 151623 TP53 191170
N32 Loeys-Dietz syndrome type 1A 609192 TGFBR1 190181
N37 Long QT syndrome 1 192500 KCNQ1 607542
N40 Lynch syndrome 120435 MLH1 120436
N44 Malignant hyperthermia 145600 RYR1 180901
N46 Marfan’s syndrome 154700 FBN1 134797
N48 Multiple endocrine neoplasia, type 1 131100 MEN1 613733
N51 MYH-associated polyposis 608456 MUTYH 604933
N52 Neurofibromatosis, type 2 101000 NF2 607379
N53 Paragangliomas 1 168000 SDHD 602690
N57 Peutz-Jeghers syndrome 175200 STK11 602216
N58 Pilomatrixoma 132600 MUTYH 604933
N59 PTEN hamartoma tumor syndrome 153480 PTEN 601728
N60 Retinoblastoma 180200 RB1 614041
N61 Tuberous sclerosis 1 191100 TSC1 605284
N63 Von Hippel-Lindau syndrome 193300 VHL 608537
N64 Wilms’ tumor 194070 WT1 607102
cat("ACMG-56 Genes:")
## ACMG-56 Genes:
print(ACMG.panel, quote = F)
##  [1] APC     MYH11   ACTA2   MYLK    TMEM43  DSP     PKP2    DSG2   
##  [9] DSC2    BRCA1   BRCA2   SCN5A   RYR2    LMNA    MYBPC3  COL3A1 
## [17] GLA     APOB    LDLR    MYH7    TPM1    PRKAG2  TNNI3   MYL3   
## [25] MYL2    ACTC1   RET     PCSK9   TNNT2   TP53    TGFBR1  TGFBR2 
## [33] SMAD3   KCNQ1   KCNH2   MLH1    MSH2    MSH6    PMS2    RYR1   
## [41] CACNA1S FBN1    MEN1    MUTYH   NF2     SDHD    SDHAF2  SDHC   
## [49] SDHB    STK11   PTEN    RB1     TSC1    TSC2    VHL     WT1


1.2 Download ClinVar VCF

ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz


ClinVar is the central repository for variant interpretations. Relevant information from the VCF includes:
(a) CLNSIG = “Variant Clinical Significance, 0 - Uncertain, 1 - Not provided, 2 - Benign, 3 - Likely benign,
4 - Likely pathogenic, 5 - Pathogenic, 6 - Drug response, 7 - Histocompatibility, 255 - Other”
(b) CLNDBN = “Variant disease name”
(c) CLNDSDBID = “Variant disease database ID”
(d) INTERP = Pathogenicity (likely pathogenic or pathogenic; CLNSIG = 4 or 5)

#Number to interpretation
clnsig_map <- c(0:7,255,-1) %>% setNames(c("Uncertain",
  "Not_Provided","Benign","Likely_Benign","Likely_Pathogenic",
  "Pathogenic","Drug_Response","Histocompatibility","Other","NA")) 

get_clinvar <- function(clinvar_file) {
  file.by.line <- readLines(clinvar_file)
  #file_date <- as.Date(strsplit(file.by.line[2],"=")[[1]][2], "%Y%m%d")
  #system(sprintf("mv %s ClinVar_Reports/clinvar_%s.vcf", clinvar_file, file_date))
  clean.lines <- file.by.line[!grepl("##.*", file.by.line)] #Remove ## comments
  clean.lines[1] <- sub('.', '', clean.lines[1]) #Remove # from header
  input <- read.table(text = paste(clean.lines, collapse = "\n"), header = T, stringsAsFactors = F, 
                      comment.char = "", quote = "", sep = "\t")
  input <- input[nchar(input$REF)==1,] #deletions
  alt_num <- sapply(strsplit(input$ALT,","),length) #number of alts
  acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
  input <- input[nchar(input$ALT)==acceptable_nchar,] #insertions
  input$ALT <- strsplit(input$ALT,",")
  split_all <- strsplit(input$INFO,";")
  has.clndsdbid <- any(grepl("CLNDSDBID", split_all))
  
  split_info <- function(name) {
    sapply(split_all, function(entry) {
      entry[grep(name,entry)]
    }) %>% strsplit("=") %>% sapply(function(x) x[2]) %>% strsplit(",")
  }
  input$CLNALLE <- split_info("CLNALLE=") %>% sapply(as.integer)
  input$CLNSIG <- split_info("CLNSIG=")
  input$CLNDBN <- split_info("CLNDBN=")
  if (has.clndsdbid)
    input$CLNDSDBID <- split_info("CLNDSDBID=")
  #CLNALLE has 0,-1,3,4 --> CLNSIG has 1,2,3,4 --> ALT has 1. 
  taking <- sapply(input$CLNALLE, function(x) x[x>0] ) #Actual elements > 0. Keep these in CLNSIG and ALT 
  taking_loc <- sapply(input$CLNALLE, function(x) which(x>0) )#Tracks locations for keeping in CLNALLE
  keep <- sapply(taking, length)>0 #reduce everything to get rid of 0 and -1
  # Reduce, reduce, reduce. 
  taking <- taking[keep]
  taking_loc <- taking_loc[keep]
  input <- input[keep,]
  
  #Make this more readable
  input$ALT <- sapply(1:nrow(input), function(row) {
    input$ALT[[row]][taking[[row]]]
  })
  
  col_subset <- function(name) {
    sapply(1:nrow(input), function(row) {
      unlist(input[row,name])[taking_loc[[row]]]
    })
  }
  input$CLNSIG <- col_subset("CLNSIG")
  input$CLNALLE <- col_subset("CLNALLE")
  input$CLNDBN <- col_subset("CLNDBN")
  if (has.clndsdbid)
    input$CLNDSDBID <- col_subset("CLNDSDBID")
  filter_condition <- input[,unlist(lapply(input, typeof))=="list"] %>% 
    apply(1,function(row) !any(is.na(row)))
  input <- input %>% filter(filter_condition) %>%
    unnest %>% unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>%
    select(VAR_ID, CHROM, POS, ID, REF, ALT, CLNALLE, CLNSIG, everything()) %>% 
    mutate(CLNSIG = strsplit(CLNSIG,"|",fixed = T)) %>% 
    mutate(CLNDBN = strsplit(CLNDBN,"|",fixed = T)) %>% 
    mutate(POS = as.integer(POS))
  if (has.clndsdbid)
    input <- input %>% mutate(CLNDSDBID = strsplit(CLNDSDBID,"|",fixed = T)) 
  input$CLNSIG <- sapply(input$CLNSIG, function(x) as.integer(x))
  input$INTERP <- sapply(input$CLNSIG, function(x) any(x %in% c(4,5)) ) 
  input
}
clinvar <- get_clinvar(clinvar_file)
cat(sprintf("Processed ClinVar data frame %s x %s (selected rows/columns):", nrow(clinvar), ncol(clinvar)))
## Processed ClinVar data frame 126349 x 14 (selected rows/columns):
clinvar[1:6,] %>% select(-CLNALLE, -INFO, -QUAL, -FILTER) %>% remove_rownames %>% pander
VAR_ID CHROM POS ID REF ALT CLNSIG
1_949523_C_T 1 949523 rs786201005 C T 5
1_949739_G_T 1 949739 rs672601312 G T 5
1_955597_G_T 1 955597 rs115173026 G T 2
1_955619_G_C 1 955619 rs201073369 G C 255
1_957568_A_G 1 957568 rs115704555 A G 2
1_957605_G_A 1 957605 rs756623659 G A 5

Table continues below

CLNDBN CLNDSDBID INTERP
Immunodeficiency_38_with_basal_ganglia_calcification CN221808:616126 TRUE
Immunodeficiency_38_with_basal_ganglia_calcification CN221808:616126 TRUE
not_specified CN169374 FALSE
not_specified CN169374 FALSE
not_specified CN169374 FALSE
Congenital_myasthenic_syndrome C0751882:ORPHA590 TRUE


1.3 Download 1000 Genomes VCFs

ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.[chrom].phase3_[version].20130502.genotypes.vcf.gz


Downloaded 1000 Genomes VCFs are saved in: /Users/jamesdiao/Documents/Kohane_Lab/2016-paper-ACMG-penetrance/1000G/

download_1000g <- function(gene, download) {
  #for tracking: #gene %>% paste(which(ACMG.panel==gene)) %>% paste(length(ACMG.panel), sep = "/") %>% print
  success <- FALSE
  refGene <- sprintf("select * from refGene where name2 = \"%s\" limit 20", gene) %>% query
  UCSC <- select(refGene, name, chrom, start = txStart, end = txEnd)
  if (nrow(UCSC) == 0) { #No hit on refGene
    return(rep("NOT_FOUND",5) %>% setNames(c("name","chrom","start","end","downloaded")))
  } else {
    if (nrow(UCSC) > 1) #Multiple hits: take the widest range
      UCSC <- UCSC[which.max(UCSC$end-UCSC$start),]
    if (download) {
    # gets [n] from chr[n]
    chrom.num <- strsplit(UCSC$chrom, split = "chr")[[1]][2]
    # different version for chromosomes X and Y
    version <- switch(chrom.num, "X" = "shapeit2_mvncall_integrated_v1b",
                      "Y" = "integrated_v2a", "shapeit2_mvncall_integrated_v5a")
    command <- paste("tabix -h ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.%s.",
                     "phase3_%s.20130502.genotypes.vcf.gz %s:%s-%s > %s_genotypes.vcf", sep = "")
    sprintf(command, UCSC$chrom, version, chrom.num, UCSC$start, UCSC$end, gene) %>% system
    Sys.sleep(2)
    # Checks whether the file exists and has non-zero size
    exists <- grepl(paste(gene,"_genotypes.vcf",sep =""), system("ls", intern = T)) %>% sum > 0
    file.size <- strsplit(paste("stat ","_genotypes.vcf", sep = gene) %>% 
                            system(intern = T), " ")[[1]][8]
    success <- exists & file.size > 0
    }
  }
  return(c(UCSC,"downloaded" = success))
}

if (!skip_download & !skip_processing) {
  system("mkdir 1000G")
  setwd(paste(getwd(), "1000G", sep = "/"))
  for (con in dbListConnections(MySQL())) dbDisconnect(con)
  con <- dbConnect(MySQL(), user = 'genome',
                   dbname = 'hg19', host = 'genome-mysql.cse.ucsc.edu',
                   unix.sock = "/Applications/MAMP/tmp/mysql/mysql.sock")
  query <- function (input) { suppressWarnings(dbGetQuery(con, input)) }
  download_output <- sapply(ACMG.panel, function(gene) download_1000g(gene, download = T)) %>% t
  print(download_output, quote = F)
  download_output <- download_output %>% 
    apply(2, unlist) %>% 
    as.data.frame(stringsAsFactors = F) %>% 
    mutate("gene" = rownames(download_output)) %>% 
    select(gene, everything()) %>% 
    filter(downloaded != "NOT_FOUND")
  download_output <- download_output %>%
    mutate(chrom = sapply(strsplit(download_output$chrom,"chr"), function(x) x[2]), 
           start = as.integer(start), end = as.integer(end), 
           downloaded = as.logical(downloaded))
  write.table(download_output, file = "download_output.txt", 
              row.names = F, col.names = T, quote = F, sep = "\t")
  system("rm *.genotypes.vcf.gz.tbi")
} else {
  if (skip_download)
    download_output <- read.table("1000G/download_output.txt", header = T, stringsAsFactors = F)
  else
    download_output <- read.table("Supplementary_Files/download_output.txt", header = T, stringsAsFactors = F)
}
cat(sprintf("Download report: region and successes: %s x %s (selected rows):", nrow(download_output), ncol(download_output)))
## Download report: region and successes: 56 x 6 (selected rows):
download_output[1:5,] %>% format(scientific = F) %>% pander
gene name chrom start end downloaded
APC NM_001127511 5 112043201 112181936 TRUE
MYH11 NM_001040113 16 15796991 15950887 TRUE
ACTA2 NM_001141945 10 90694830 90751154 TRUE
MYLK NM_001321309 3 123331142 123603149 TRUE
TMEM43 NM_024334 3 14166439 14185180 TRUE
cat("File saved as download_output.txt in Supplementary_Files")
## File saved as download_output.txt in Supplementary_Files


\newpage

1.4 Collect 1000 Genomes Phase 3 Populations Map

This will allow us to assign genotypes from the 1000 Genomes VCF to ancestral groups.
From: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel

#read the map and delete the file
map <- read.table(file = "Supplementary_Files/phase3map.txt", stringsAsFactors = F, header = T) %>% as.data.frame
#display
cat("Phase 3 Populations Map Table: 2504 x 4 (selected rows)")
## Phase 3 Populations Map Table: 2504 x 4 (selected rows)
map[sample(nrow(map),10),] %>% arrange(super_pop) %>% remove_rownames %>% pander
sample pop super_pop gender
HG02554 ACB AFR male
HG02292 PEL AMR female
NA19684 MXL AMR female
NA18945 JPT EAS male
NA19086 JPT EAS male
HG02180 CDX EAS female
HG00266 FIN EUR female
HG01537 IBS EUR female
HG03019 PJL SAS female
NA20881 GIH SAS female
#Make list of populations and superpopulations for later plotting
pop.table <- map[!duplicated(map$pop),] %>% 
  select(contains("pop")) %>% arrange(super_pop, pop)
super <- pop.table$super_pop %>% setNames(pop.table$pop)
super.levels <- unique(pop.table$super_pop)
pop.levels <- unique(pop.table$pop)
#Plot distribution of ancestral backgrounds
Population = factor(as.character(map$pop), levels = pop.levels)
cat("Population Distribution")
## Population Distribution
ggplot(map, aes(map$super_pop, fill = Population)) + 
  geom_bar(color = 'black', width = 0.5) + 
  ylab ("No. of Individuals") + xlab ("Superpopulation") + 
  ggtitle("1000 Genomes - Samples by Population")

rm(Population)


\newpage

1.5 Import and Process 1000 Genomes VCFs

  1. Unnest the data frames to 1 row per variant_ID key (CHROM_POSITION_REF_ALT).
  2. Remove all insertions, deletions, CNV, etc, and keep only missense variants (1 REF, 1 ALT)
  3. For 1000 Genomes: convert genomes to allele counts. For example: (0|1) becomes 1, (1|1) becomes 2.
    Multiple alleles are unnested into multiple counts. For example: (0|2) becomes 0 for the first allele (no 1s) and 1 for the second allele (one 2).
import.file.1000g <- function(gene) {
  sprintf("%s [%s/%s]", gene, grep(gene, ACMG.panel), length(ACMG.panel)) %>% cat
  name <- paste("1000G",paste(gene,"genotypes.vcf", sep = "_"), sep = "/")
  output <- read.table(paste(getwd(),name,sep="/"), stringsAsFactors = FALSE)
  #Add header
  names(output)[1:length(header)] <- header
  #Remove all single alt indels
  output <- output[nchar(output$REF)==1,] #deletions
  alt_num <- sapply(strsplit(output$ALT,","),length) #number of alts
  acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
  output <- output[nchar(output$ALT)==acceptable_nchar,] #insertions
  alt_num <- sapply(strsplit(output$ALT,","),length) #recalculate
  paired = which(alt_num!=1) #all with ,
  #Add AF Column
  af <- strsplit(output$INFO,";") %>% sapply("[", 2) %>% 
    strsplit("AF=") %>% sapply("[", 2) %>% strsplit(",") %>% sapply(as.numeric)
  output <- cbind(GENE = gene, "AF_1000G"=I(af), output) #Places it at the front of output
  front_cols <- 1:(grep("HG00096",colnames(output))-1)
  if (length(paired)!=0) {
    #Limit max vector length by sapply(strsplit(output$ALT,","),length)
    sapply(paired, function(rownum) { #For every row
      sapply(as.character(1:alt_num[rownum]), function(num) {
        grepl(paste(num,"|",sep = ""), output[rownum,-front_cols], fixed=T) +
        grepl(paste("|",num,sep = ""), output[rownum,-front_cols], fixed=T)
      }) %>% t -> temp
      split(temp, rep(1:ncol(temp), each = nrow(temp))) %>% setNames(NULL) 
      #Separate into list of vectors (1 entry for counting each ALT)
    }) %>% t -> insert
    insert <- cbind(output[paired,front_cols],insert)
    colnames(insert) <- colnames(output)
    insert <- insert %>% #adds front_col info
      mutate(ALT = strsplit(ALT,",")) %>% #Splits ALTS
      unnest() %>% #Unnests everything
      select(GENE, AF_1000G, CHROM, POS, ID, REF, ALT, everything()) #Reorders everything
    output <- output[-paired,] #Removes paired
  }
  output <- cbind(output[,front_cols],
                  apply(output[,-front_cols], 2, function(y) {
                    grepl("1|", y, fixed=T) +
                    grepl("|1", y, fixed=T)
                  }) ) #convert to logical
  if (length(paired)!=0)
    output <- rbind(output, insert) #joins the two
  output$AF_1000G <- as.numeric(output$AF_1000G)
  unite(output, VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
  #Make VAR_ID, arrange by VAR_ID
}

if (skip_processing) {
  load(file = "ACMG_1000G")
} else {
  # Import 1000G data for all ACMG
  ACMG.1000g <- NULL
  header <- c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", as.character(map$sample))
  for (gene in ACMG.panel) {
    #print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
    ACMG.1000g <- rbind(ACMG.1000g,import.file.1000g(gene))
  }
  #Display and remove duplicates
  ACMG.1000g <- ACMG.1000g[!duplicated(ACMG.1000g$VAR_ID),]
}
cat(sprintf("Processed 1000 Genomes VCFs: %s x %s (selected rows/columns):", nrow(ACMG.1000g), ncol(ACMG.1000g)))
## Processed 1000 Genomes VCFs: 139335 x 2516 (selected rows/columns):
ACMG.1000g[1:5,1:18] %>% select(-INFO, -QUAL, -FILTER, -FORMAT) %>% 
  format(scientific = F) %>% pander
GENE AF_1000G VAR_ID CHROM POS ID REF ALT
APC 0.000199681 5_112043211_A_G 5 112043211 rs554351451 A G
APC 0.000199681 5_112043231_G_A 5 112043231 rs575784409 G A
APC 0.005391370 5_112043234_C_T 5 112043234 rs115658307 C T
APC 0.000199681 5_112043252_G_A 5 112043252 rs558562104 G A
APC 0.008785940 5_112043263_C_T 5 112043263 rs138386816 C T

Table continues below

HG00096 HG00097 HG00099 HG00100 HG00101 HG00102
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
if (rm_rs1805124)
  ACMG.1000g <- ACMG.1000g[ACMG.1000g$ID!="rs1805124",]


1.6 Import and Process ExAC VCFs

  1. Unnest the data frames to 1 row per variant_ID key (CHROM_POSITION_REF_ALT).
  2. Remove all insertions, deletions, CNV, etc, and keep only missense variants (1 REF, 1 ALT)
  3. Collect superpopulation-level allele frequencies:
    African = AFR, Latino = AMR, European (Finnish + Non-Finnish) = EUR, East.Asian = EAS, South.Asian = SAS.
import.file.exac <- function(gene) {
  file_name <- paste("ExAC",paste(gene,"exac.csv", sep = "_"), sep= "/")
  output <- read.csv(file_name, stringsAsFactors = FALSE)
  output$Number.of.Hemizygotes <- NULL #Inconsistently present column; removal allows row aggregation
  output <- cbind(GENE = gene, output[nchar(paste(output$Alternate,output$Reference))==3,]) %>% 
    select(GENE, AF_EXAC = Allele.Frequency, CHROM=Chrom, POS=Position, 
           ID=RSID, REF=Reference, ALT=Alternate, everything()) %>% 
    unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
  tags <- list("African","Latino","East.Asian","European","South.Asian")
  output$Allele.Count.European <- output$Allele.Count.European..Non.Finnish. + output$Allele.Count.Finnish
  output$Allele.Number.European <- output$Allele.Number.European..Non.Finnish. + output$Allele.Number.Finnish
  exac_af <- output[,sprintf("Allele.Count.%s", tags)] / output[,sprintf("Allele.Number.%s", tags)]
  colnames(exac_af) <- sprintf("AF_EXAC_%s", c("AFR","AMR","EAS","EUR","SAS"))
  output <- cbind(output, exac_af) %>% 
    select(GENE, AF_EXAC, AF_EXAC_AFR, AF_EXAC_AMR, AF_EXAC_EAS, AF_EXAC_EUR, AF_EXAC_SAS, everything())
}

# Import ExAC data for all ACMG
ACMG.exac <- NULL
for (gene in ACMG.panel) {
  #print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
  ACMG.exac <- rbind(ACMG.exac,import.file.exac(gene))
}
#Display and remove duplicates
#ACMG.exac[duplicated(ACMG.exac$VAR_ID),1:8]
ACMG.exac <- ACMG.exac[!duplicated(ACMG.exac$VAR_ID),]
cat(sprintf("Processed ExAC VCFs: %s x %s (selected rows/columns):", nrow(ACMG.exac), ncol(ACMG.exac)))
## Processed ExAC VCFs: 58873 x 45 (selected rows/columns):
ACMG.exac[1:5,1:13] %>% format(scientific = F) %>% pander
GENE AF_EXAC AF_EXAC_AFR AF_EXAC_AMR AF_EXAC_EAS AF_EXAC_EUR
APC 0.00008130 0.00000000 0.0000000 0 0.0000000
APC 0.00008131 0.00000000 0.0000000 0 0.0000000
APC 0.11120000 0.07978723 0.1021505 0 0.1063298
APC 0.00008131 0.00000000 0.0000000 0 0.0000000
APC 0.00008134 0.00000000 0.0000000 0 0.0000000

Table continues below

AF_EXAC_SAS VAR_ID CHROM POS ID REF ALT
0.0001313370 5_112043365_G_C 5 112043365 . G C
0.0001313025 5_112043382_A_G 5 112043382 . A G
0.1184659837 5_112043384_T_G 5 112043384 rs78429131 T G
0.0001313025 5_112043392_C_T 5 112043392 . C T
0.0001313025 5_112043412_C_G 5 112043412 . C G
if (rm_rs1805124)
  ACMG.exac <- ACMG.exac[ACMG.exac$ID!="rs1805124",]


\newpage

1.7 Merge ClinVar with 1000 Genomes and ExAC

merge_clinvar_1000g <- function() {
  inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.1000g$VAR_ID)
  clinvar_merged <- clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID)
  ACMG_merged <- ACMG.1000g[ACMG.1000g$VAR_ID %in% inter,] %>% arrange(VAR_ID)
  front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
  cbind(ACMG_merged[,c("GENE","AF_1000G")], clinvar_merged,
                  ACMG_merged[,-front_cols])
}
merged_1000g <- merge_clinvar_1000g()

inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.exac$VAR_ID)
merged_exac <- cbind(clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID), 
  ACMG.exac %>% select(VAR_ID, contains("AF_"), GENE) %>% 
    filter(VAR_ID %in% inter) %>% arrange(VAR_ID) %>% select(-VAR_ID)
  ) %>% select(VAR_ID, GENE, AF_EXAC, contains("AF_"), everything())

#count up all pathogenic ClinVar in ACMG regions
is.acmg <- function(row) {
  genes <- which(row$CHROM == download_output$chrom)
  sapply(genes, function(gene) {
    between(row$POS, download_output[gene,]$start, download_output[gene,]$end)
  }) %>% any
}
cat("Breakdown of ClinVar Variants")
## Breakdown of ClinVar Variants
data.frame(Subset_ClinVar = c("Total ClinVar","LP/P-ClinVar","LP/P-ClinVar & ACMG",
             "LP/P-ClinVar & ACMG & ExAC","LP/P-ClinVar & ACMG & 1000 Genomes"),
           Number_of_Variants = c(nrow(clinvar), 
                                  sum(clinvar$INTERP), 
                                  sum(apply(clinvar[clinvar$INTERP,], 1, is.acmg)), 
                                  nrow(merged_exac), 
                                  nrow(merged_1000g))) %>% pander
Subset_ClinVar Number_of_Variants
Total ClinVar 126349
LP/P-ClinVar 33033
LP/P-ClinVar & ACMG 6252
LP/P-ClinVar & ACMG & ExAC 826
LP/P-ClinVar & ACMG & 1000 Genomes 122
cat("Breakdown of ACMG-1000 Genomes Variants")
## Breakdown of ACMG-1000 Genomes Variants
data.frame(Subset_1000_Genomes = c("Total 1000_Genomes & ACMG", "1000_Genomes & ACMG & ClinVar",
                                   "1000_Genomes & ACMG & LP/P-ClinVar"), 
           Number_of_Variants = c(nrow(ACMG.1000g),
                                  intersect(clinvar$VAR_ID, ACMG.1000g$VAR_ID) %>% length, 
                                  nrow(merged_1000g))) %>% pander
Subset_1000_Genomes Number_of_Variants
Total 1000_Genomes & ACMG 139335
1000_Genomes & ACMG & ClinVar 4891
1000_Genomes & ACMG & LP/P-ClinVar 122
cat("Breakdown of ACMG-ExAC Variants")
## Breakdown of ACMG-ExAC Variants
data.frame(Subset_ExAC = c("Total ExAC & ACMG","ExAC & ACMG & ClinVar","ExAC & ACMG & LP/P-ClinVar"),
          Number_of_Variants = c(nrow(ACMG.exac),
                                 intersect(clinvar$VAR_ID, ACMG.exac$VAR_ID) %>% length, 
                                 nrow(merged_exac))) %>% pander
Subset_ExAC Number_of_Variants
Total ExAC & ACMG 58873
ExAC & ACMG & ClinVar 10043
ExAC & ACMG & LP/P-ClinVar 826


\newpage

1.8 Comparison with ClinVar Browser Query Results

clinvar_query.txt contains all results matched by the search query: “(APC[GENE] OR MYH11[GENE]… OR WT1[GENE]) AND (clinsig_pathogenic[prop] OR clinsig_likely_pathogenic[prop])” from the ClinVar website. The exact query is saved in /Supplementary_Files/query_input.txt


This presents another way of collecting data from ClinVar.

Intermediate step: convert hg38 locations to hg19 using the Batch Coordinate Conversion tool (liftOver) from UCSC Genome Browser Utilities.

clinvar_query <- read.table(file = "Supplementary_Files/clinvar_query_2016-10-25.txt", sep = "\t", header = F, skip = 1, stringsAsFactors = F)
colnames(clinvar_query) <- c("Name", "Gene(s)", "Condition(s)", "Frequency", "Clinical significance (Last reviewed)","Review status", "Chromosome","Location","Assembly","VariationID","AlleleID(s)")
#cat(sprintf("Initial count: %s variants", nrow(clinvar_query)))
clinvar_count <- rep(0,7)
clinvar_count[1] <- nrow(clinvar_query)
clinvar_query <- clinvar_query[grepl(".>.",clinvar_query$Name),]
#cat(sprintf("After filtering for substitutions (N>N'): %s variants", nrow(clinvar_query)))
clinvar_count[2] <- nrow(clinvar_query)
clinvar_query <- clinvar_query[!grepl(" - ", clinvar_query$Location) & 
                              !grepl("|",clinvar_query$Chromosome, fixed = T) &
                              !(clinvar_query$Location == ""),]
#cat(sprintf("After filtering for coupling/bad-locations: %s variants (FINAL)", nrow(clinvar_query)))
clinvar_count[3] <- nrow(clinvar_query)
clinvar_query <- clinvar_query %>%
  mutate(Name = sub(".*(.>.).*","\\1", clinvar_query$Name)) %>% 
  mutate(Location = as.integer(Location)) %>%
  separate(Name, into = c("Alternate","Reference"), sep = ">")
#liftOver from http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/liftOverMerge
#input data from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/
#paste(paste0("chr",clinvar_query$Chromosome), clinvar_query$Location, clinvar_query$Location+1) %>%
#  write.table(file = "Supplementary_Files/cvquery_location.bed", quote = F, row.names = F, col.names = F)
#system("chmod +x ../Tools/liftOver")
#system("../Tools/liftOver Supplementary_Files/cvquery_hg38.bed ../Tools/hg38ToHg19.over.chain.gz Supplementary_Files/cvquery_hg19.bed err.log")
cvquery_hg19 <- read.table(file = "Supplementary_Files/cvquery_hg19.bed", sep = "\t", header = F, stringsAsFactors = F)
clinvar_query <- clinvar_query %>%
  mutate(Location = cvquery_hg19$V2) %>%
  unite(col = "VAR_ID", Chromosome, Location, Reference, Alternate, sep = "_", remove = F)
#cat(sprintf("Overlap with ClinVar VCF: %s variants", sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID)))
clinvar_count[4] <- sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID)
#cat(sprintf("Overlap with LP/P ClinVar VCF: %s variants", sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID[clinvar$INTERP])))
clinvar_count[5] <- sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID[clinvar$INTERP])
#cat(sprintf("Overlap with LP/P ClinVar VCF AND ExAC: %s variants", sum(clinvar_query$VAR_ID %in% merged_exac$VAR_ID)))
clinvar_count[6] <- sum(clinvar_query$VAR_ID %in% merged_exac$VAR_ID)
#merged_exac[merged_exac$VAR_ID %in% clinvar_query$VAR_ID, c(1:3,11,15)][1:3,]
#cat(sprintf("Overlap with LP/P ClinVar VCF AND 1000 Genomes: %s variants", sum(clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)))
clinvar_count[7] <- sum(clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)
#merged_1000g[merged_1000g$VAR_ID %in% clinvar_query$VAR_ID, c(3,1:2,6,10)][1:3,]
clinvar_count[8] <- sum((clinvar_query$VAR_ID %in% merged_exac$VAR_ID) & (clinvar_query$VAR_ID %in% merged_1000g$VAR_ID))
clinvar_query[(clinvar_query$VAR_ID %in% merged_exac$VAR_ID) & (clinvar_query$VAR_ID %in% merged_1000g$VAR_ID),] -> temp
cat(sprintf("ClinVar Query Results Table (substitutions only): %s x %s (selected rows/columns)", 
            nrow(clinvar_query), ncol(clinvar_query)))
## ClinVar Query Results Table (substitutions only): 6714 x 13 (selected rows/columns)
filter(clinvar_query, 
    (clinvar_query$VAR_ID %in% merged_exac$VAR_ID) &
    (clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)) %>% select(c(1,4,5,6)) %>%
  mutate(`Condition(s)` = `Condition(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1)) %>%
  mutate(Frequency = Frequency %>% strsplit(", ") %>% sapply("[",1) ) %>% 
  mutate(`Gene(s)` = `Gene(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1) ) %>% pander
VAR_ID Gene(s) Condition(s) Frequency
X_100652891_C_G GLA Fabry disease GMAF:0.00050(G)
11_47374186_C_G MYBPC3 Primary familial hypertrophic cardiomyopathy GMAF:0.00020(G)
11_47355233_C_G MYBPC3 Familial hypertrophic cardiomyopathy 4 GMAF:0.00020(G)
11_47364162_C_G MYBPC3 Familial hypertrophic cardiomyopathy 4 GMAF:0.00020(G)
14_23886482_G_C MYH7 not specified GMAF:0.00020(C)
14_23893148_C_G MYH7 Primary dilated cardiomyopathy GO-ESP:0.00046(G)
1_17355075_A_T SDHB Gastrointestinal stromal tumor GMAF:0.00120(T)
1_17380507_G_C SDHB Cowden syndrome 2 GO-ESP:0.01323(C)
cat("Breakdown of ClinVar Query Results Table: ")
## Breakdown of ClinVar Query Results Table:
data.frame(Subset = c("Initial Count","Filter Substitutions (N>N')","Filter Coupling/Bad-Locations","In ClinVar VCF","In LP/P-ClinVar VCF","^ & ACMG & ExAC","^ & ACMG & 1000 Genomes", "^ & ACMG & ExAC & 1000 Genomes"), Number_of_Variants = clinvar_count) %>% pander
Subset Number_of_Variants
Initial Count 12525
Filter Substitutions (N>N’) 6732
Filter Coupling/Bad-Locations 6714
In ClinVar VCF 509
In LP/P-ClinVar VCF 503
^ & ACMG & ExAC 49
^ & ACMG & 1000 Genomes 9
^ & ACMG & ExAC & 1000 Genomes 8
cat("Note the 12-fold reduction after merging the online query results with the VCF.")
## Note the 12-fold reduction after merging the online query results with the VCF.



\newpage

2 Plot Summary Statistics Across Populations

### For plotting population level data:
prettyprint <- function(values, sd, title, xlabel, ylabel, ylimit) {
  if (missing(sd)) sd <- TRUE
  if (missing(title)) title <- NULL
  if (missing(xlabel)) xlabel <- "Population"
  if (missing(ylabel)) ylabel <- NULL
  if (missing(ylimit)) ylimit <- NULL
  colnames(values) <- c("Mean","SD")
  values$Population <- factor(pop.levels, levels = pop.levels)
  values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
  
  plot.pop <- ggplot(values, aes(x=Population, y=Mean, fill = Superpopulation)) +
    geom_bar(stat = "identity") + ggtitle(title) + xlab(xlabel) + ylab(ylabel) +
    theme_minimal() + theme(axis.text.x = element_text(angle = -45, hjust = 0.4))
  if (sd) {
    if (min(values$Mean - values$SD)<0)
      plot.pop <- plot.pop + geom_errorbar(aes(
        ymin = pmax(0,values$Mean - values$SD), 
        ymax = values$Mean + values$SD, width = 0.5))
    else 
      plot.pop <- plot.pop + geom_errorbar(aes(ymin = values$Mean - values$SD, 
                                               ymax = values$Mean + values$SD, width = 0.5))
  } else {values$SD = 0}
  if (length(ylimit)==2)
    plot.pop <- plot.pop + ylim(ylimit[1],ylimit[2])
  else
    plot.pop <- plot.pop + ylim(0, 1.1*max(values$Mean + values$SD))
  plot.pop
}


2.1 Distribution of Allele Frequencies


plot_af_distrib <- function() {
  af_distrib <- data.frame(Index = 2:max(nrow(merged_1000g),nrow(merged_exac))-1,
    AF_1000G = sort(merged_1000g$AF_1000G[merged_1000g$AF_1000G != max(merged_1000g$AF_1000G)],
                    decreasing = T) %>% c(rep(NA,nrow(merged_exac)-nrow(merged_1000g))), 
    AF_EXAC = sort(merged_exac$AF_EXAC[merged_exac$AF_EXAC != max(merged_exac$AF_EXAC)], 
                   decreasing = T)) %>%
    gather(Dataset, Allele_Frequency, AF_1000G, AF_EXAC) %>%
    filter(!is.na(Allele_Frequency))
  ggplot(aes(x = Allele_Frequency, color = Dataset), data = af_distrib) + 
    geom_density() + facet_grid(Dataset ~ .) + xlab("Allele Frequency") + ylab("Density") +
    theme(legend.position="none")
}
plot_af_distrib()

#Test of Poissonness
x = table(merged_1000g$AF_1000G)
k = 1:length(x)-1
poissondata = data.frame(k=k, poissonness = as.vector(log(x)+lfactorial(k)))

The distribution of allele frequencies is approximately Poisson, with “Poissonness plot” correlation = 0.99. The Poissonness plot (Hoaglin 1980) is defined as the plot of \(log(x_k) + log(k!)\) vs. \(k\), as shown below:

ggplot(aes(x=k,y=poissonness), data = poissondata) + geom_point() + ylab("log(x_k) + log(k!)") + ggtitle("Poissonness Plot")

\newpage

2.2 Overall Non-Reference Sites

2.2.0.1 For 1000 Genomes

Each individual has \(n\) non-reference sites, which can be found by counting. The mean number is computed for each population.

Ex: the genotype of 3 variants in 3 people looks like this:

example <- ACMG.1000g[c(88,95,96),14:16]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
  HG00097 HG00099 HG00100
Variant 1 0 2 1
Variant 2 0 0 1
Variant 3 0 0 1

Count the number of non-reference sites per individual:

colSums(example>0) %>% pander
HG00097 HG00099 HG00100
0 1 3
cat(sprintf("Mean = %s", mean(colSums(example>0)) %>% signif(3)))
## Mean = 1.33
### 1000 Genomes
front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
sapply(pop.levels, function(pop) {
  #Counts the number of non-reference sites in a gene
  temp <- colSums(ACMG.1000g[,c(front_cols,map$pop)==pop]>0)
  c(mean(temp), sd(temp))
}) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
prettyprint(values, title = "ACMG-56: Mean in 1000 Genomes", sd = T, ylimit = NULL, 
            xlabel = "Population", ylabel = "Mean No. of Non-Reference Sites")


Note: the error bars denote standard deviation, not standard error.

\newpage

2.2.0.2 For ExAC

The mean number of non-reference sites is \(E(V)\), where \(V = \sum_{i=1}^n v_i\) is the number of non-reference sites at all variant positions \(v_1\) through \(v_n\).

At each variant site, the probability of having at least 1 non-reference allele is \(P(v_i) = P(v_{i,a} \cup v_{i,b})\), where \(a\) and \(b\) indicate the 1st and 2nd allele at each site.

If the two alleles are independent, \(P(v_{i,a} \cup v_{i,b})\) = \(1-(1-P(v_{i,a}))(1-P(v_{i,b})) = 1-(1-AF(v_i))^2\)

If all variants are independent, \(E(V) = \sum_{i=1}^n 1-(1-AF(v_i))^2\) for any set of allele frequencies.


Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:

example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.1 0.2 0 0 0.3
Variant 2 0.2 0 0.3 0 0.1

The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\). Note that this is approximately \(2*AF\) when \(AF\) is small:

as.data.frame(1-(1-example)^2) %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.19 0.36 0 0 0.51
Variant 2 0.36 0 0.51 0 0.19

By linearity of expectation, the expected (mean) number of non-reference sites is \(\sum E(V_i) = \sum (columns)\).

colSums(1-(1-example)^2) %>% pander
AFR AMR EAS EUR SAS
0.55 0.36 0.51 0 0.7
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
exac_prob <- 1-(1-ACMG.exac[,sprintf("AF_EXAC_%s",super.levels)])^2
exac_values <- data.frame(exac_prob %>% colSums(na.rm = T), super.levels)
colnames(exac_values) = c("values","Superpopulation")
ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) + 
  geom_bar(stat = "identity") + theme_minimal() + ggtitle("ACMG-56: Mean in ExAC") + 
  xlab("Population") + ylab("Mean No. of Non-Reference Sites") + 
  ylim(0,1.1*max(exac_values$values))


\newpage

2.3 Pathogenic Non-Reference Sites

2.3.0.1 For 1000 Genomes and ExAC

This is the same procedure as above, but performed only on the subset of variants that are pathogenic.

front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
### 1000 Genomes
sapply(pop.levels, function(pop) {
  #Counts the number of non-reference sites in a gene
  temp <- colSums(merged_1000g[,c(front_cols,map$pop)==pop]>0)
  c(mean(temp), sd(temp))
}) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
prettyprint(values, title = "ACMG-56 Pathogenic: Mean in 1000 Genomes", sd = T, ylimit = NULL, 
            xlabel = "Population", ylabel = "Mean No. of Non-Reference Sites")

### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
exac_prob <- (1-(1-merged_exac[,sprintf("AF_EXAC_%s",super.levels)])^2)
exac_values <- data.frame(exac_prob %>% colSums(na.rm = T), super.levels)
colnames(exac_values) = c("values","Superpopulation")
ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) + 
  geom_bar(stat = "identity") + ggtitle("ACMG-56 Pathogenic: Mean in ExAC") + 
  xlab("Population") + ylab("Mean No. of Non-Reference Sites") + theme_minimal() + 
  ylim(0,1.1*max(exac_values$values))


2.4 Fraction of Individuals with Pathogenic Sites

2.4.0.1 For 1000 Genomes

We can count up the fraction of individuals with 1+ non-reference site(s) in each population. This is the fraction of individuals who would receive a positive genetic test result in at least 1 of the ACMG-56 genes.

Ex: the genotype of 3 variants in 3 people looks like this:

example <- ACMG.1000g[c(88,95,96),14:16]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
  HG00097 HG00099 HG00100
Variant 1 0 2 1
Variant 2 0 0 1
Variant 3 0 0 1

Count each individual as having a non-reference site (1) or having only reference sites (0):

(1*(colSums(example>0)>0)) %>% pander
HG00097 HG00099 HG00100
0 1 1
cat(sprintf("Mean = %s", mean(1*(colSums(example>0)>0)) %>% signif(3)))
## Mean = 0.667
front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
### 1000 Genomes
sapply(pop.levels, function(pop) {
  #Counts the number of non-reference sites in a gene
  keep = length(front_cols)+which(map$pop == pop)
  temp <- colSums(merged_1000g[,keep])>0
  c(mean(temp),sd(temp))
}) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
prettyprint(values, title = "ACMG-56 Pathogenic: 1000 Genomes Fraction", sd = F, ylimit = NULL, 
            xlabel = "Population", ylabel = "Fraction with at least 1 non-reference site")

\newpage

2.4.0.2 For ExAC

The probability of having at least 1 non-reference site is \(P(X)\), where \(X\) indicates a non-reference site at any variant position \(v_1\) through \(v_n\).

Recall that \(P(v_i) = P(v_{i,a} \cup v_{i,b}) = 1-(1-AF(v))^2\) when alleles are independent.

If all alleles are independent, \(P(X) = P(\bigcup_{i=1}^n v_i) = 1-\prod_{i=1}^n (1-AF(v_i))^2\)

Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:

example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.1 0.2 0 0 0.3
Variant 2 0.2 0 0.3 0 0.1

The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\).
Note that this is approximately \(2*AF\) when \(AF\) is small:

as.data.frame(1-(1-example)^2) %>% pander
  AFR AMR EAS EUR SAS
Variant 1 0.19 0.36 0 0 0.51
Variant 2 0.36 0 0.51 0 0.19

The expected (mean) number of non-reference sites is given by \(1-\prod (1-AF)^2\).

apply(example, 2, function(x) 1-prod((1-x)^2)) %>% pander
AFR AMR EAS EUR SAS
0.4816 0.36 0.51 0 0.6031
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
exac_prob <- (1-(1-merged_exac[,sprintf("AF_EXAC_%s",super.levels)])^2)
exac_values <- data.frame(1-apply(1-exac_prob, 2, prod, na.rm = T), super.levels)
colnames(exac_values) = c("values","Superpopulation")
ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) + 
  geom_bar(stat = "identity") + ggtitle("ACMG-56 Pathogenic: ExAC Fraction") + 
  xlab("Population") + ylab("Fraction with at least 1 non-reference site") + theme_minimal() + 
  ylim(0,1.05*max(exac_values$values))


\newpage

2.5 Test Statistics for Ancestral Differences

F-statistic/T-statistic: probability that the different groups are sampled from distributions with the same mean.
These plots are from 4(a) - 1000 Genomes Fraction with 1+ Non-Reference Site, but can be replicated for plots 2(ab) and 3(ab) as well.

#Calculating test statistics (F-values)
Fcalc <- function(values, pop) {
  if (missing(pop)) {
    groups <- super[pop.levels]
  } else {
    groups <- ifelse(super[pop.levels]==pop,pop,"Other")
  }
  data <- data.frame(y = values, group = factor(groups))
  color_map <- c("red","gold3","springgreen3","deepskyblue","violet","white") %>% 
    setNames(c("AFR","AMR","EAS","EUR","SAS","Other"))
  out <- lm(y ~ group, data) %>% anova
  plot(y ~ group, data, xlab = NULL, ylab = NULL, 
       col = color_map[sort(unique(groups))], 
       main = paste("F-statistic:",out$`Pr(>F)`[1] %>% signif(3)))
  out
}
par(mfrow=c(2,3))
F_values <- c(Fcalc(values$Mean)$`Pr(>F)`[1] %>% setNames("Overall"), 
              sapply(super.levels, function(pop) {
                Fcalc(values$Mean, pop)$`Pr(>F)`[1]
              }))

par(mfrow=c(1,1), mar=c(5, 4, 4, 2)+0.1)
#F_values


\newpage

2.6 Common Pathogenic Variants by Ancestry

### 1000 Genomes
front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
sapply(super.levels, function(superpop) {
  (merged_1000g[,length(front_cols)+which(map$super_pop == superpop)] %>% 
    rowSums %>% setNames(merged_1000g$VAR_ID)) /
    (2*sum(map$super_pop==superpop))
}) -> af_1000g_by_ancestry
ord <- order(apply(af_1000g_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- row.names(af_1000g_by_ancestry)[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id), 
    af_1000g_by_ancestry[ord,]) %>% gather(Ancestry, Subdivided_Allele_Frequencies, -Var_ID)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
    geom_bar(stat='identity', color = 'black', width = 0.7) + 
    ggtitle("High Frequency Variants in 1000 Genomes") + coord_flip()

### ExAC
af_exac_by_ancestry <- merged_exac[,sprintf("AF_EXAC_%s",super.levels)]
colnames(af_exac_by_ancestry) <- super.levels
ord <- order(apply(af_exac_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- merged_exac$VAR_ID[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id), 
                         af_exac_by_ancestry[ord,]) %>% 
              gather(Ancestry, Subdivided_Allele_Frequencies, 2:6)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
    geom_bar(stat='identity', color = 'black', width = 0.7) + 
    ggtitle("High Frequency Variants in ExAC") + coord_flip()


3 Penetrance Estimates

3.1 Bayes’ Rule as a Model for Estimating Penetrance

Let \(V_x\) be the event that an individual has 1 or more variant related to disease \(x\),
and \(D_x\) be the event that the individual is later diagnosed with disease \(x\).

In this case, we can define the following probabilities:
1. Prevalence = \(P(D_x)\)
2. Allele Frequency = \(P(V_x)\)
3. Allelic Heterogeneity = \(P(V_x|D_x)\)
4. Penetrance = \(P(D_x|V_x)\)

By Bayes’ Rule, the penetrance of a variant related to disease \(x\) may be defined as: \[P(D_x|V_x) = \frac{P(D_x)*P(V_x|D_x)}{P(V_x)} = \frac{Prevalence*Allelic.Heteogeneity}{Allele.Frequency}\]

To compute penetrance estimates for each of the diseases related to the ACMG-56 genes, we will use the prevalence data we collected into Literature_Prevalence_Estimates.csv, allele frequency data from 1000 Genomes and ExAC, and a broad range of values for allelic heterogeneity.

3.2 Import Literature-Based Disease Prevalence Data

Data Collection: 1. Similar disease subtypes were grouped together (e.g., the 8 different types of familial hypertrophic cardiomyopathy), resulting in 30 disease categories across 56 genes.
2. The search query “[disease name] prevalence” was used to find articles using Google Scholar.
3. Prevalence estimates were recorded along with URL, journal, region, publication year, sample size, first author, population subset (if applicable), date accessed, and potential issues. Preference was given to studies with PubMed IDs, more citations, and larger sample sizes.

Prevalence was recorded as reported: either a point estimate or a range. Values of varying quality were collected across all diseases.

colclass <- rep("character",16)
colclass[c(5:6,10,14)] <- "numeric"
ACMG_Lit <- read.csv(file = "Literature_Prevalence_Estimates.csv", header = TRUE, stringsAsFactors = F, na.strings = "\\N", colClasses = colclass)
disease_abbrev <- sapply(ACMG_Lit$Disease, function(d) {
  strsplit(d," ") %>% sapply("[",ifelse(grepl("Familial",d),2,1)) %>% substr(1,9) %>% paste0("...")
}) %>% setNames(NULL)
disease_abbrev_longer <- sapply(ACMG_Lit$Disease, function(d) {
  out <- substr(d,1,17) 
  split_out <- strsplit(out," ") %>% unlist
  if (nchar(split_out[length(split_out)]) <= 2)
    out <- split_out[-length(split_out)] %>% paste(collapse = " ")
  out %>% paste0("...")
}) %>% setNames(NULL)
combined <- (ACMG_Lit$Inverse.Prevalence.1+ACMG_Lit$Inverse.Prevalence.2)/2
mean.prev <- ACMG_Lit$Inverse.Prevalence.1 %>% setNames(disease_abbrev_longer)
mean.prev[!is.na(combined)] <- combined[!is.na(combined)]
cat(sprintf("Table of Literature-Based Estimates of Disease Prevalence %s x %s (selected rows/columns):", nrow(ACMG_Lit), ncol(ACMG_Lit)))
## Table of Literature-Based Estimates of Disease Prevalence 30 x 16 (selected rows/columns):
ACMG_Lit[c(4,5,8,14),] %>% remove_rownames %>% 
  select(Gene, Disease, Disease_MIM, Tags, Inverse.Prevalence.1, Inverse.Prevalence.2, year,first.author,citations) %>% pander
Gene Disease Disease_MIM Tags
BRCA1;BRCA2 Breast-ovarian cancer familial 604370;612555 breast;ovarian
SCN5A Brugada syndrome 601144 brugada
COL3A1 Ehlers-Danlos syndrome 130050 ehler;danlos
TP53 Li-Fraumeni syndrome 151623 fraumeni

Table continues below

Inverse.Prevalence.1 Inverse.Prevalence.2 year first.author citations
104 NA 2013 NA NA
10000 2000 2006 Antzelevitch 11
20000 NA 2010 Malfait 116
20000 5000 1999 Schneider 47


3.3 Distribution of Prevalences

data.frame(Disease = disease_abbrev_longer, 
         Inverse_Prevalence = mean.prev %>% setNames(NULL)) %>%
ggplot(aes(x = factor(Disease, levels = Disease[order(Inverse_Prevalence)]), y = Inverse_Prevalence)) + 
  geom_point(stat = 'identity') + coord_flip() + xlab("Disease") + 
  scale_y_continuous(trans='log10', breaks = 10^(0:6), 
    labels = sapply(0:6, function(x) paste0("1",paste0(rep("0",x), collapse = "")))) +
  theme(axis.text.y=element_text(size=7)) + 
  ggtitle("Distribution of Inverse Prevalences (log-scale)") + ylab("Inverse Prevalence")

\newpage

3.4 Collect and Aggregate Allele Frequencies at the Disease-Level

We define AF(disease) as the probability of having at least 1 variant associated with the disease.
The frequencies across the relevant variants can be aggregated in two ways:
(1) By direct counting, from genotype data in 1000 Genomes.
(2) AF(disease) = \(1-\prod_{variant}(1-AF_{variant})\), from population data in ExAC (assumes independence).

if (!("AF_1000G_AFR" %in% colnames(merged_1000g))) {
  front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
  sapply(super.levels, function(superpop){
    (merged_1000g[,length(front_cols)+which(map$super_pop == superpop)] %>% rowSums)/(2*2504)
  }) -> pop_af
  colnames(pop_af) <- sprintf("AF_1000G_%s",super.levels)
  merged_1000g <- data.frame(merged_1000g, pop_af) %>% 
    select(GENE, AF_1000G, VAR_ID, CHROM, POS, ID, REF, ALT, CLNALLE, CLNSIG, 
           AF_1000G_AFR, AF_1000G_AMR, AF_1000G_EAS, AF_1000G_EUR, AF_1000G_SAS, everything())
}


front_cols <- 1:(grep("HG00096",colnames(merged_1000g))-1)
getAlleleFreq <- function(input, ind, method, dataset) {
  if (method == "MIM") {
    search_list <- ACMG_Lit$Disease_MIM
    search_in <- "CLNDSDBID"
  }
  if (method == "Tags") {
    search_list <- ACMG_Lit$Tags
    search_in <- "CLNDBN"
  }
  if (method == "Gene") {
    search_list <- ACMG_Lit$Gene
    search_in <- "GENE"
  }
  output <- sapply(search_list, function(item.vec) {
    item.vec.split <- strsplit(item.vec,";") %>% unlist
    loc <- rep(FALSE, nrow(input)) #loc = locations of all the "hits"
    for(item in item.vec.split)
      loc <- loc | grepl(item,input[,search_in], ignore.case = T)
    hits <- sum(loc)
    sapply(c(dataset,super.levels), function(superpop) {
      if (ind) {
        find = sprintf("AF_%s",toupper(dataset))
        if (superpop!=dataset)
          find = paste(find, superpop, sep = "_")
        # Aggregation by calculation + ind assumption
        freq <- input[loc,find] %>% unlist %>% as.numeric #vector of all allele frequencies
        freq <- 1-(1-freq)^2 #probability of having a non-reference site at EITHER allele. 
        final <- 1-prod(1-freq[!is.na(freq)])
      } else {
        # Aggregation by counting
        front_cols <- 1:(grep("HG00096",colnames(input))-1)
        if (superpop == dataset)
          catch <- -front_cols
        else
          catch <- length(front_cols)+which(map$super_pop==superpop)
        final <- mean(colSums( input[loc, catch] )>0)
      }
      final
    }) -> final_list
    col_head <- paste("AF",toupper(dataset),sep = "_")
    c(final_list, hits) %>% setNames(c(col_head, sprintf("%s_%s", col_head, super.levels), "Hits"))
  }) %>% t %>% tbl_df
  
  data.frame(search_list, output)
}
# Other methods MIM and Tags
# Do NOT use MIM if CLNDSDBID is missing (older VCFs)
freq_1000g.count.gene <- getAlleleFreq(merged_1000g, ind = F, method = "Gene", dataset = "1000G")
freq_1000g.calc.gene <- getAlleleFreq(merged_1000g, ind = T, method = "Gene", dataset = "1000G")
freq_exac.calc.gene <- getAlleleFreq(merged_exac, ind = T, method = "Gene", dataset = "EXAC")
allele.freq <- data.frame(
                   COUNT_1000G = freq_1000g.count.gene$AF_1000G, 
                   CALC_1000G = freq_1000g.calc.gene$AF_1000G, 
                   CALC_EXAC = freq_exac.calc.gene$AF_EXAC
  )
row.names(allele.freq) <- ACMG_Lit$Disease


#cor(allele.freq) %>% as.data.frame %>% pander
par(mfrow=c(1,2), mar=c(5, 5.5, 4, 2)+0.1)
ggplot(aes(x = CALC_1000G, y = CALC_EXAC), data = allele.freq) + 
  geom_point(stat = "identity", col = 'red') + ggtitle("Scatterplot: ExAC v. 1000 Genomes") + 
  geom_text_repel(aes(label = disease_abbrev_longer), size = 3) + 
  scale_x_continuous(limits = c(10^-5, max(allele.freq[,"CALC_EXAC"]))*5, 
                     trans='log10', breaks = 10^-(0:3)) + 
  scale_y_continuous(limits = c(10^-5, max(allele.freq[,"CALC_EXAC"]))*5, 
                     trans='log10', breaks = 10^-(0:3)) + 
  xlab("Allele Frequency (1000 Genomes)") + ylab("Allele Frequency (ExAC)") +
  geom_abline(slope = 1, intercept = 0)

\newpage

Ratio_1000G (red, top) computes AF(calculation in 1000 Genomes) / AF(counting in 1000 Genomes).
Ratio_ExAC (blue, bottom) computes AF(calculation in ExAC) / AF(counting in 1000 Genomes).

ratio_diff <- function() {
  ratio <- data.frame(Means = allele.freq$COUNT_1000G, 
                      Ratio_1000G = (pmax(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)/
                                      pmin(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)), 
                      Ratio_ExAC = (pmax(allele.freq$CALC_EXAC, allele.freq$CALC_1000G)/
                                     pmin(allele.freq$CALC_EXAC, allele.freq$CALC_1000G))) %>%
    mutate(Disease = factor(disease_abbrev_longer, 
           levels = disease_abbrev_longer[order(pmax(Ratio_ExAC,Ratio_1000G))])) %>% 
    gather(Dataset, Ratio, Ratio_1000G, Ratio_ExAC) %>%
    filter(is.finite(Ratio))
  ggplot(aes(x=Disease, y=Ratio, color = Dataset), data = ratio) + coord_flip() + 
    geom_point(stat = 'identity') + facet_wrap("Dataset", ncol = 1) + 
    ggtitle("Ratios of Allele Frequencies from Different Methods") + 
    scale_y_continuous(breaks = seq(0,100,1)) + theme(legend.position="none")
}
ratio_diff()


\newpage

Sampling 1000 variants from all variants in 1000 Genomes to test deviations from independence assumptions.
Repeat for 1000 trials and plot the distribution of disease-level allele frequencies (1000 points per disease).

plot_ind_test <- function() {
  load("Supplementary_Files/ind_test.RData")
  #set.seed(123)
  #do.call("rbind",lapply(1:1000, function(x) {
  #  select_rows <- sample(nrow(ACMG.1000g),1000)
  #  data.frame(DISEASE = disease_abbrev_longer,
  #             COUNT = getAlleleFreq(ACMG.1000g[select_rows,], 
  #               ind = F, method = "Gene", dataset = "1000G")$AF_1000G, 
  #             CALC = getAlleleFreq(ACMG.1000g[select_rows,], 
  #               ind = T, method = "Gene", dataset = "1000G")$AF_1000G)
  #})) -> ind_test_data
  #plot(ind_test_data %>% ggplot(aes(x=CALC-COUNT)) + geom_histogram(bins = 100) + 
  #  xlab("Calculation - Counting") + ylab("Histogram Counts") + facet_wrap("DISEASE", ncol = 3))
  plot(sapply(levels(ind_test_data$DISEASE), function(d) {
    ind_test_data %>% filter(DISEASE == d) %>% mutate(DIFF = CALC - COUNT) %>% 
      select(DIFF) %>% unlist
  }) %>% data.frame %>% gather(Disease, Data) %>% 
    ggplot(aes(x = Disease, y = Data)) + geom_boxplot() + coord_flip() + 
    ylab("Allele Frequency Difference: Calculation - Counting") + 
    ggtitle("Differences in AF Methods: by Disease"))
  cat("30 diseases x 1000 points = 30,000 points. This plot has been downsampled 10x and contains 3,000 points.")
  plot(ggplot(aes(x = COUNT, y = CALC), data = ind_test_data[runif(nrow(ind_test_data)) < 0.1,]) +
         geom_point() + 
         scale_x_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) + 
         scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) + 
         xlab("Allele Frequency (From Counting)") + ylab("Allele Frequency (From Calculation)") +
         ggtitle("Testing Independence with Random Sampling") + 
         geom_abline(slope = 1, intercept = 0, color = 'blue') +
         geom_text(aes(label = replace(DISEASE,which(abs(COUNT - CALC) < 0.4), NA)), 
                   check_overlap = F, hjust = 1, na.rm = T, size = 2.5)
       )
  ind_test_data %>% filter(COUNT>0)
}
plot_ind_out <- plot_ind_test()

## 30 diseases x 1000 points = 30,000 points. This plot has been downsampled 10x and contains 3,000 points.

Pearson correlation: 0.99
Mean ratio (Calculation/Counting): 1.07

\newpage

3.5 Penetrance as a Function of P(V|D)

The left end of the boxplot indicates P(V|D) = 0.001,
the bold line in the middle indicates P(V|D) = 0.25,
the right end of the boxplot indicates P(V|D) = 0.5.

get_penetrance <- function(ah_low, ah_high, dataset) {
  # Map of disease name to disease tags
  if (dataset == "1000 Genomes")
    named.freqs <- allele.freq$COUNT_1000G %>% setNames(disease_abbrev_longer)
  if (dataset == "ExAC")
    named.freqs <- allele.freq$CALC_EXAC %>% setNames(disease_abbrev_longer)
  named.prev <- 1/mean.prev %>% setNames(disease_abbrev_longer)
  # Repeats allow for correct quartile calculations
  #point estimate set to arithmetic mean
  allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high)
  #Functions to transform data points with disease_af = 0
  set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
  # Matrix of penetrance values for allelic het range, capped at 1
  penetrance <- as.vector(allelic.het %o% (named.prev/named.freqs)) %>% set_to_na()
  order(matrix(penetrance,nrow = 5) %>% colSums, decreasing = T) -> ord
  # replicate each element n times to create labels
  penetrance_data <- data.frame("Penetrance" = penetrance, "Disease" =
    factor(sapply(disease_abbrev_longer, function(x) rep(x,length(allelic.het))) %>% as.vector,
    levels = disease_abbrev_longer[ord]) )
  colormap <- rep('black', length(disease_abbrev))
  colormap[1:(mean(is.na(penetrance_data$Penetrance))*length(colormap))] <- 'gray60'
  colormap <- rev(colormap)
  plot(
    ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) + 
    geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() + 
      ggtitle(dataset) + theme(axis.text.y = element_text(color = colormap)) 
      #annotate("text", x = which(colormap == 'red'), y = 0, 
      #         label = "No Allele Frequency Data", hjust = 0, size = 3)
  )
  penetrance_data
}
pen_1000g <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "1000 Genomes")

pen_exac <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "ExAC")


Note 1: the grayed-out empty lines at the top all indicate no allele frequency (disease_AF) data.
Note 2: For breast-ovarian cancer, mean theoretical penetrance > 1. This is because the assumed allelic heterogeneity (0.25) is greater than is possible, given the empirical prevalence and allele frequencies.

\newpage

3.6 Penetrance as a Function of P(D)

doubles <- ACMG_Lit[!is.na(combined),c("Disease","Inverse.Prevalence.1","Inverse.Prevalence.2")]
doubles$Mean.Inverse.Prevalence <- apply(doubles[,-1], 1, function(row) 1/mean(1/row))
data.frame(Disease = doubles$Disease, Prevalence_Ratio = round(doubles$Inverse.Prevalence.1/doubles$Inverse.Prevalence.2,1)) %>% arrange(Prevalence_Ratio) %>% mutate(Prevalence_Ratio = sprintf("%.1f",Prevalence_Ratio)) %>% pander
Disease Prevalence_Ratio
Retinoblastoma 1.3
Marfan’s syndrome 1.5
Lynch syndrome 3.0
Adenomatous polyposis coli 3.3
Li-Fraumeni syndrome 4.0
Paragangliomas 4.0
Pilomatrixoma 4.0
Brugada syndrome 5.0
Peutz-Jeghers syndrome 12.0
Left ventricular noncompaction 18.6

The left end of the boxplot indicates P(D) = upper value,
the bold line in the middle indicates P(D) = mean(values),
the right end of the boxplot indicates P(D) = lower value.

prev.range <- 1/doubles[,c(2,2,4,3,3)] %>% as.matrix
#Set this to 1 rather than 0.02
penetrance_out <- function(allelic.het) {
  set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
  penetrance <- (allelic.het*prev.range/allele.freq$COUNT_1000G[!is.na(combined)]) %>% 
    apply(2, set_to_na)
  rownames(penetrance) <- doubles$Disease
  ord <- penetrance %>% rowSums %>% order(decreasing = T)
  penetrance <- penetrance[ord,]
  colnames(penetrance) <- NULL
  # replicate each element n times to create labels
  penetrance_data <- data.frame("Penetrance" = penetrance %>% t %>% as.vector,
    "Disease" = factor(sapply(row.names(penetrance), function(x) rep(x,5)) %>% as.vector,
    levels = row.names(penetrance))
  )
  colormap <- rep('black', nrow(doubles))
  colormap[1:(mean(is.na(penetrance_data$Penetrance))*length(colormap))] <- 'gray60'
  colormap <- rev(colormap)
  plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) + 
    geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() + 
    ggtitle(sprintf("1000 Genomes: P(V|D) = %s", allelic.het)) + 
    theme(axis.text.y = element_text(color = colormap)) 
  )
  penetrance_data
}
pen_ah_0.1 <- penetrance_out(0.1)

pen_ah_1.0 <- penetrance_out(1.0)

This can only be computed in the 9 cases where a prevalence range was given (rather than a point estimate)
and the disease-level allele frequency is known (in this plot: all of them except Puetz-Jeghers).

3.7 Max/Min Penetrance as a Function of P(D) and P(V|D)

The left end of the boxplot indicates P(D) AND P(V|D) = lower value,
the bold line in the middle indicates P(D) AND P(V|D) = mean(values),
the right end of the boxplot indicates P(D) AND P(V|D) = upper value.

get_penetrance <- function(ah_low, ah_high, dataset, range) {
  # Map of disease name to disease tags
  if (dataset == "1000 Genomes")
    named.freqs <- allele.freq$COUNT_1000G %>% setNames(disease_abbrev_longer)
  if (dataset == "ExAC")
    named.freqs <- allele.freq$CALC_EXAC %>% setNames(disease_abbrev_longer)
  allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high)
  inv.prev_1 <- 1/ACMG_Lit$Inverse.Prevalence.1 %>% setNames(disease_abbrev_longer)
  inv.prev_2 <- 1/ACMG_Lit$Inverse.Prevalence.2 %>% setNames(disease_abbrev_longer)
  #Adjust ranges of point values to 5, with arithmetic mean = original value
  #take unique prev_1 values and compute a = 2k/(1+r) = lower value of a 5x range
  inv.prev_1[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*2/(1+range)
  #take NA prev_2 values and set as 5a. 
  inv.prev_2[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*5
  prev_final <- data.frame(inv.prev_1, inv.prev_1, (inv.prev_1+inv.prev_2)/2, 
                             inv.prev_2, inv.prev_2)
  cap_at_1 <- function(row) {pmin(row,1)}
  set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
  penetrance <- apply(sweep(prev_final,MARGIN=2,allelic.het,`*`) / named.freqs, 1, set_to_na) %>% as.data.frame %>% unlist
  # Matrix of penetrance values for allelic het range, capped at 1
  order(matrix(penetrance,nrow = 5) %>% colSums, decreasing = T) -> ord
  # replicate each element n times to create labels
  penetrance_data <- data.frame("Penetrance" = penetrance, "Disease" =
    factor(sapply(disease_abbrev_longer, function(x) rep(x,length(allelic.het))) %>% as.vector,
    levels = disease_abbrev_longer[ord]) )
  colormap <- rep('black', length(disease_abbrev))
  colormap[1:(mean(is.na(penetrance_data$Penetrance))*length(colormap))] <- 'gray60'
  colormap <- rev(colormap)
  plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) + 
    geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() + 
      ggtitle(dataset) + theme(axis.text.y = element_text(color = colormap))
  ) 
  penetrance_data
}

pen_1000g_max <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "1000 Genomes", range = 5)

pen_exac_max <- get_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "ExAC", range = 5)

Note: Prevalence ranges of 5x were assumed for all point estimates of prevalence.
For example: a point estimate of 0.3 would be given the range [0.1, 0.5].

3.8 Penetrance Estimates by Ancestry

ancestry_penetrance <- function(ah_low, ah_high, dataset, range) {
  allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high)
  sapply(c("Total",super.levels), function(superpop){
    # Map of disease name to disease tags
    if (dataset == "1000 Genomes") {
      find <- "AF_1000G"
      named.freqs <- freq_1000g.count.gene
    }
    if (dataset == "ExAC") {
      find <- "AF_EXAC"
      named.freqs <- freq_exac.calc.gene
    }
    if (superpop != "Total") 
      find <- paste(find, superpop, sep = "_")
    named.freqs <- named.freqs[,find] %>% unlist %>% setNames(disease_abbrev_longer)
    inv.prev_1 <- 1/ACMG_Lit$Inverse.Prevalence.1 %>% setNames(disease_abbrev_longer)
    inv.prev_2 <- 1/ACMG_Lit$Inverse.Prevalence.2 %>% setNames(disease_abbrev_longer)
    #Adjust ranges of point values to 5, with arithmetic mean = original value 
    inv.prev_1[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*2/(1+range)
    inv.prev_2[is.na(inv.prev_2)] <- inv.prev_1[is.na(inv.prev_2)]*5
    prev_final <- data.frame(inv.prev_1, inv.prev_1, (inv.prev_1+inv.prev_2)/2, 
                             inv.prev_2, inv.prev_2)
    # Matrix of penetrance values for allelic het range, capped at 1
    cap_at_1 <- function(row) {pmin(row,1)}
    set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
    (sweep(prev_final,MARGIN=2,allelic.het,`*`) / named.freqs) %>% 
      apply(1, set_to_na) %>% as.data.frame %>% unlist
  }) %>% as.data.frame -> penetrance_init
  order(matrix(penetrance_init$Total, nrow = 5) %>% colSums, decreasing = T) -> ord
  penetrance_data <- data.frame(penetrance_init, 
                                "Disease" = factor(sapply(disease_abbrev_longer, 
                                function(x) rep(x,length(allelic.het))) %>% as.vector,
                                levels = disease_abbrev_longer[ord]) ) %>% 
  gather(Subset, Penetrance, -Disease)
  plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) + 
    geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() + 
    facet_wrap(~Subset, ncol = 2) + ggtitle(sprintf("Barplot: Penetrance by Ancestry (%s)", dataset)) + 
    theme(axis.text.y=element_text(size=6), axis.text.x = element_text(angle = -20, hjust = 0.4))
  )
  plot(ggplot(aes(x=Disease, y = Subset), data = penetrance_data[c(F,F,F,F,T),]) + coord_flip() + 
    geom_tile(aes(fill = Penetrance), color = 'white') + xlab("Disease") + ylab("Ancestry") +
    scale_fill_gradient(low='white',high = 'darkblue', na.value = "grey50",
      breaks=c(0,0.25,0.5,0.75,1), labels=c("0","0.25","0.50","0.75","1.00"), limits =c(0,1)) + 
    ggtitle(sprintf("Heatmap: Max Penetrance by Ancestry (%s)", dataset)) + 
    theme_minimal() + theme(axis.ticks = element_blank()) + 
    annotate("segment", y=c(0.5,5.5,6.5), yend=c(0.5,5.5,6.5), 
             x=0.5, xend = length(disease_abbrev)+0.5) +
    annotate("segment", y=0.5, yend=6.5, 
             x=c(0.5,length(disease_abbrev)+0.5), 
             xend = c(0.5,length(disease_abbrev)+0.5))
  )
  cat("Dark gray boxes are NA: no associated variants discovered in that ancestral population.")
}
ancestry_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "1000 Genomes", range = 5)

## Dark gray boxes are NA: no associated variants discovered in that ancestral population.
ancestry_penetrance(ah_low = 0.001, ah_high = 0.5, dataset = "ExAC", range = 5)

## Dark gray boxes are NA: no associated variants discovered in that ancestral population.

3.9 Empirical CDFs for All Penetrance Plots

par(mfrow=c(1,2))
pen_1000g$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% 
  plot(ylab = "Fraction with max theoretical penetrance < P", xlab = "P", 
       main = "CDF: Penetrance ~ P(V|D)", ylim = c(0,1.02), pch = 19)
pen_exac$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("ExAC","1000G"), col = c("red","black"), pch = c(1,19))

pen_ah_0.1$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% 
  plot(xlab = "P", ylab = "", main = "CDF: Penetrance ~ P(D)", ylim = c(0,1.02), pch = 19)
pen_ah_1.0$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("P(V|D) = 0.1","P(V|D) = 1.0"), col = c("black","red"), pch = c(19,1))

pen_1000g_max$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% 
  plot(ylab = "Fraction with max theoretical penetrance < P", xlab = "P", 
       main = "CDF: Penetrance ~ P(V|D) AND P(D)", ylim = c(0,1.02), pch = 19)
pen_exac_max$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("ExAC","1000G"), col = c("red","black"), pch = c(1,19))
par(mfrow=c(1,1), mar=c(5, 4, 4, 2)+0.1)

3.10 Comparing Mean Penetrance between ExAC and 1000 Genomes

penetrance_data <- data.frame(Disease = disease_abbrev_longer, 
                              Penetrance_1000_Genomes = pen_1000g_max$Penetrance[c(F,F,T,F,F)], 
                              Penetrance_ExAC = pen_exac_max$Penetrance[c(F,F,T,F,F)]) %>%
  filter(!is.na(Penetrance_1000_Genomes) & !is.na(Penetrance_ExAC))
ggplot(aes(x = Penetrance_1000_Genomes, y = Penetrance_ExAC), data = penetrance_data) + 
  geom_point(stat = "identity", col = 'red') + 
  ggtitle("Penetrance Means: ExAC v. 1000 Genomes (log-log scale)") + 
  geom_text_repel(aes(label = Disease), size = 3) +
  scale_x_continuous(trans='log10', breaks = 10^-(0:6)) + 
  scale_y_continuous(trans='log10', breaks = 10^-(0:6)) + 
  geom_smooth(method = "lm", formula=y~x-1) + 
  xlab("Penetrance (1000 Genomes)") + ylab("Penetrance (ExAC)")

lm_compare <- lm(Penetrance_1000_Genomes ~ Penetrance_ExAC, penetrance_data)

The Pearson correlation is 0.76.
Max penetrance values computed using 1000 Genomes are 0.944-fold larger than those computed using ExAC.